Load packages
library(data.table)
library(stringr)
library(ggplot2)
theme_py <- theme_light() + theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_rect(colour = "black", fill = NA),
text = element_text(size = 20),
strip.placement = "outside",
strip.text = element_text(size = 20, color = "black"),
strip.background = element_rect(fill = "white")
)
theme_set(theme_py)
library(patchwork)
library(ggrepel)
library(ComplexHeatmap)
library(GenomicRanges)
library(openxlsx)
require(ggraph)
require(igraph)
source("scripts/functions.R")
source("../motif-analysis/mta_downstream_functions.R")
Data directories and metadata
atac_dir <- file.path(
"ATACSEQ", "nucleosome_free_regions", "results"
)
cns_dir <- file.path(
"ATACSEQ", "nucleosome_free_regions", "consensus_peaks"
)
bnd_dir <- file.path(
"ATACSEQ", "nucleosome_free_regions", "footprint", "BINDetect"
)
rna_dir <- file.path(
"RNASEQ_QUANTIFICATION", "results"
)
pub_dir <- file.path("published")
res_dir <- file.path("results")
fig_dir <- file.path("plots")
for (newdir in c(res_dir, fig_dir))
dir.create(newdir, showWarnings = FALSE)
# metadata
des_fn <- file.path("RNASEQ_QUANTIFICATION", "design_table.tsv")
des_dt <- fread(des_fn)
setnames(des_dt, "reporter_line", "reporterline")
col_dt <- des_dt[, .(sample, reporterline, condition)]
col_dt[, reporterline := factor(
reporterline,
levels = c("Elav", "Fox", "Ncol")
)]
col_dt[, condition := factor(condition, levels = c("negative", "positive"))]
setorder(col_dt, reporterline, condition)
# colors
condition_cols <- c("positive" = "blue", "negative" = "red")
line_cols <- c("Elav" = "#ff7f00", "Fox" = "#984ea3", "Ncol" = "#4daf4a")
line_cond_cols <- c(
"Elav_pos" = "#ff7f00", "Fox_pos" = "#984ea3", "Ncol_pos" = "#4daf4a",
"Elav_neg" = "#ebbd8f", "Fox_neg" = "#d18adb", "Ncol_neg" = "#90d18e"
)
Gene annotations and lists
marks_gold <- fread(
file.path("annotation", "golden-marks-231124.tsv"),
fill = TRUE, sep = "\t", header = FALSE
)[, c(2:1)]
setnames(marks_gold, c("gene", "name"))
marks_gold_v <- structure(marks_gold$name, names = marks_gold$gene)
tfs_annt <- fread(
file.path("annotation", "tfs.Nvec.tsv"),
header = TRUE
)[, .SD[1], gene]
setnames(tfs_annt, c("gene", "name", "pfam"))
gene_annt <- fread(
file.path("annotation", "Nvec_annotation_v3_2020-10-23_DToL_names"),
select = 1:3
)
setnames(gene_annt, c("gene", "pfam", "name"))
gene_ann <- rbindlist(list(
gene_annt[!gene %in% tfs_annt$gene],
tfs_annt
), use.names = TRUE)
gene_ann[gene %in% marks_gold$gene, name := marks_gold_v[gene]]
Load normalized gene expression (RNA-seq) and motif accessibility (ATAC-seq) data
# GENES
# size norm gene expression
exps_mt <- read.table(
file.path(rna_dir, "mat_norm.tsv")
)
# size and gene norm gene expression
expr_mt <- read.table(
file.path(rna_dir, "norm_expression.tsv")
)
# gene expression TPMs
tpms_mt <- read.table(
file.path(rna_dir, "tpm_expression.tsv")
)
# gene clusters
gene_clust <- fread(
file.path(rna_dir, "genes_clusters.tsv")
)
# PEAKS
# peak accessibility
accs_mt <- read.table(
file.path(atac_dir, "mat_norm.tsv")
)
# peak to gene assignment
assign <- fread(
file.path(atac_dir, "consensusSeekeR-peaks-gene-assignment.tsv")
)
# peak clusters
pks_clust <- fread(
file.path(atac_dir, "consensusSeekeR-peaks-clusters.bed")
)
bed_cols <- c(
"seqnames", "start", "end", "peak", "score", "strand", "cluster", "group"
)
setnames(pks_clust, bed_cols)
# MOTIFS
# motif enrichment scores
motf_dt <- fread(
file.path(atac_dir, "motif-enrich-archetypes-mona.tsv")
)
setnames(motf_dt, "label", "group")
# motif enrichment qvalue
motf_qv <- read.table(
file.path(atac_dir, "motif-enrich-archetypes-mona-qvalue.tsv"),
sep = "\t"
)
# motif enrichment log2fc
motf_fc <- read.table(
file.path(atac_dir, "motif-enrich-archetypes-mona-fc.tsv"),
sep = "\t"
)
# motif to tf assignment
dc <- fread(file.path(
"annotation",
"assignment-archetype-motif-gene.tsv.gz"
))[, .(gene, archetype_name)]
dc[, motif := str_extract(archetype_name, "ARCH\\d+")]
dc[, archetype_name := NULL]
Load fold changes for expression and motif binding
# binding comparisons fold changes for best correlated motifs
dp_dt <- fread(file.path(
bnd_dir, "bindetect_results_reporterline.txt"
))
dp_dt[, id := paste(reporterline, vs, sep = "_vs_")]
dp_dt[, motif_gene := paste(motif, gene, sep = "__")]
dt_atac <- dcast.data.table(
dp_dt, motif_gene ~ id, value.var = "log2FoldChange"
)
mt_atac <- as.matrix(dt_atac[, -1])
rownames(mt_atac) <- dt_atac$motif_gene
# RNA comparisons fold changes
mt_rna <- read.table(
file.path(rna_dir, "log2fc_expression.tsv"),
sep = "\t",
header = TRUE
)
Dot plot of expression and binding score
dt_rna_plot <- melt.data.table(
as.data.table(mt_rna, keep.rownames = "gene"),
id.vars = "gene", variable.name = "comparison", value.name = "rna_fc"
)
dt_atac_plot <- melt.data.table(
as.data.table(mt_atac, keep.rownames = "motif_gene"),
id.vars = "motif_gene", variable.name = "comparison", value.name = "atac_fc"
)
dt_atac_plot[, gene := str_remove(motif_gene, "ARCH\\d+__")]
dt_atac_plot[, motif := str_extract(motif_gene, "ARCH\\d+")]
dt_comb <- merge.data.table(
dt_rna_plot, dt_atac_plot,
by = c("gene", "comparison")
)
# filtering
ids_rna <- rownames(mt_rna)[apply(mt_rna, 1, function(x) max(x) > 1.5)]
ids_atac <- rownames(mt_atac)[apply(mt_atac, 1, function(x) max(x) > 0.1)]
dt_plot <- dt_comb[gene %in% ids_rna & motif_gene %in% ids_atac]
# add gene annotation
dt_plot <- merge.data.table(
dt_plot, gene_ann, by = "gene",
all.x = TRUE, sort = FALSE
)
# label for plotting
dt_plot[, label := paste(
substr(name, 1, 30), gene, motif,
sep = " | "
)]
# clustering
tfs <- unique(dt_plot$gene)
hc <- hclust(dist(mt_rna[tfs, ]), method = "ward.D2")
ord <- tfs[hc$order]
# ordering
tfs <- unique(dt_plot$gene)
mord <- order(apply(mt_rna[tfs,], 1, which.max))
ord <- tfs[mord]
# order genes
dt_plot[, gene := factor(gene, levels = rev(ord))]
setorder(dt_plot, gene)
dt_plot[, label := factor(label, levels = unique(dt_plot$label))]
setorder(dt_plot, label)
# limits for plotting
dt_plot[rna_fc < 0, rna_fc := 0]
dt_plot[rna_fc > 8, rna_fc := 8]
# plot
require(ggplot2)
require(scales)
gp_dot <- ggplot(dt_plot, aes(comparison, label)) +
geom_point(
aes(size = rna_fc, fill = atac_fc),
shape = 21,
color = "black"
) +
scale_fill_gradientn(
colours = colorRampPalette(RColorBrewer::brewer.pal(11, "BrBG"))(1000),
# limits = c(-5, 5), oob = squish, breaks = c(-5, 0, 5),
name = "motif binding\nlog2(fold change)",
) +
scale_size_continuous(
# range = c(1, 7),
# breaks = c(0, 2, 4, 6),
name = "gene expression\nlog2(fold change)"
) +
theme(
panel.grid.major = element_line(size = 0.25),
axis.text.y = element_text(size = 8),
axis.text.x = element_text(size = 8, angle = 90, vjust = 0.5, hjust = 1),
legend.direction = "vertical"
)
gp_dot
Select correlated genes
mt_bnd <- copy(mt_atac)
rownames(mt_bnd) <- str_remove(rownames(mt_bnd), "ARCH\\d+__")
mt_exp <- mt_rna[rownames(mt_bnd), colnames(mt_bnd)]
rownames(mt_bnd) <- rownames(mt_atac)
rownames(mt_exp) <- rownames(mt_atac)
# correlation between same comparisons (columns)
mt_cors_smp <- cor(mt_bnd, mt_exp, method = "spearman")
diag(mt_cors_smp)
## Elav_vs_Fox Elav_vs_Ncol Elav_vs_neg Fox_vs_Elav Fox_vs_Ncol Fox_vs_neg
## 0.1808882 0.1054204 0.3131112 0.1808882 0.1976934 0.2128027
## Ncol_vs_Elav Ncol_vs_Fox Ncol_vs_neg
## 0.1054204 0.1976934 0.1407049
# correlation between genes (rows)
mt_cors_gen <- cor(t(mt_bnd), t(mt_exp), method = "spearman")
cors_gen <- diag(mt_cors_gen)
names(cors_gen) <- rownames(mt_cors_gen)
cors_gen <- sort(cors_gen, decreasing = TRUE)
# select correlated genes
select_pos <- names(cors_gen[(cors_gen) > 0.5])
# correlation distribution
dt_cors <- data.table(cor = cors_gen)
dt_cors$motif_gene <- names(cors_gen)
dt_cors[, correlation := "no"]
dt_cors[motif_gene %in% select_pos, correlation := "positive"]
dt_cors[, motif := str_extract(motif_gene, "ARCH\\d+")]
dt_cors[, gene := str_remove(motif_gene, "ARCH\\d+__")]
# plot
gp_hist <- ggplot(dt_cors, aes(cor, fill = correlation)) +
geom_histogram(color = "black", binwidth = 0.05) +
scale_y_continuous(expand = expansion(c(0, 0.1))) +
scale_fill_manual(values = c(
"positive" = "green",
"no" = "grey"
)) +
labs(x = "correlation", y = "number of genes") +
theme(legend.position = "none")
gp_hist
Inspect correlations between gene expression and motif binding score per sample
mt_ord <- cbind(mt_exp, mt_bnd)
colnames(mt_ord) <- paste(
colnames(mt_ord),
rep(c("RNA", "ATAC"), each = 9),
sep = "_"
)
mt_dt <- as.data.table(mt_ord, keep.rownames = "motif_gene")
mt_dt[, motif := str_extract(motif_gene, "ARCH\\d+")]
mt_dt[, gene := str_remove(motif_gene, "ARCH\\d+__")]
mt_dt[, motif_gene := NULL]
dt_atac <- melt.data.table(
mt_dt,
id.vars = c("gene", "motif"),
measure.vars = grep("ATAC", colnames(mt_dt), value = TRUE),
value.name = "ATAC", variable.name = "comparison"
)
dt_atac[, comparison := str_remove(comparison, "_ATAC")]
dt_rna <- melt.data.table(
mt_dt,
id.vars = "gene",
measure.vars = grep("RNA", colnames(mt_dt), value = TRUE),
value.name = "RNA", variable.name = "comparison"
)
dt_rna[, comparison := str_remove(comparison, "_RNA")]
dt_comp <- merge.data.table(
dt_rna, dt_atac,
by = c("gene", "comparison"), all = TRUE
)
setcolorder(dt_comp, c("motif", "gene", "ATAC", "RNA"))
dt_comp[, reporterline := str_extract(comparison, "Elav|Fox|Ncol")]
dt_comp[, vs := str_extract(comparison, "vs.*")]
dt_comp[, vs := str_remove(vs, "vs_")]
dt_comp <- merge.data.table(
dt_comp, dt_cors, by = c("motif", "gene"),
all.x = TRUE, sort = FALSE
)
# add gene annotation
dt_comp <- merge.data.table(
dt_comp, gene_ann, by = "gene",
all.x = TRUE, sort = FALSE
)
# save
fwrite(
dt_comp,
file.path(res_dir, "correlation_expression_binding.tsv"),
sep = "\t"
)
# label for plotting
dt_comp[, label := substr(name, 1, 20)]
# posterboys to label
dt_lab <- dt_comp[cor > 0.5 & abs(RNA) > 1 & abs(ATAC) > 0.1]
# plot
gp_comp <- ggplot(dt_comp, aes(RNA, ATAC, color = correlation)) +
geom_point(data = dt_comp[correlation == "no"], alpha = 0.3) +
geom_smooth(method = "lm", se = FALSE) +
geom_text_repel(
data = dt_lab[ATAC > 0],
aes(label = label),
color = "black",
size = 3,
segment.color = "grey50",
min.segment.length = 0,
nudge_y = 0.5 - dt_lab[ATAC > 0]$ATAC,
) +
geom_text_repel(
data = dt_lab[ATAC < 0],
aes(label = label),
color = "black",
size = 3,
segment.color = "grey50",
min.segment.length = 0,
nudge_y = -0.5 - dt_lab[ATAC < 0]$ATAC,
) +
geom_point(data = dt_comp[correlation != "no"], alpha = 0.8) +
scale_color_manual(values = c(
"positive" = "green",
"no" = "grey"
)) +
scale_y_continuous(expand = c(0.1, 0.1)) +
facet_grid(vs ~ reporterline) +
theme(legend.position = "bottom")
gp_comp
## `geom_smooth()` using formula = 'y ~ x'
## Warning: ggrepel: 20 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 17 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 22 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 26 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 20 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 32 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 17 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 32 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 19 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
Inspect correlation of binding and expression per gene
# binding correlations with expression
dt_comp <- fread(
file.path(res_dir, "correlation_expression_binding.tsv")
)
dt_comp_cor <- unique(
dt_comp[correlation != "no"][, .(motif, gene, reporterline)]
)
# for individual genes
comp_marks <- dt_comp[gene %in% marks_gold$gene]
comp_marks_gp <- ggplot(comp_marks, aes(ATAC, RNA, color = reporterline, shape = vs)) +
scale_color_manual(values = line_cols) +
scale_shape_manual(values = c("Elav" = 15, "Fox" = 16, "Ncol" = 17, "neg" = 18)) +
facet_wrap("name") +
geom_hline(yintercept = 0, linetype = 2, size = 0.5) +
geom_vline(xintercept = 0, linetype = 2, size = 0.5) +
geom_point(size = 2) +
ggpubr::stat_cor(method = "pearson", label.x = -0.5, label.y = 10, color = "black") +
theme(legend.position = "bottom") +
labs(x = "motif binding logFC", y = "expression logFC")
comp_marks_gp
## Warning: Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Computation failed in `stat_cor()`
## Caused by error in `cor.test.default()`:
## ! not enough finite observations
Load baseline networks from BINDetect bound targets that are also expressed in given sample, and subset for those TFs for which binding and expression are correlated.
# binding correlations with expression
dt_comp <- fread(
file.path(res_dir, "correlation_expression_binding.tsv")
)
# binding targets
mo_dt <- fread(
file.path(atac_dir, "motifs_genes_targets_long.tsv.gz")
)
mo_dt <- mo_dt[expressed == TRUE & target_expressed == TRUE]
mo_dt <- mo_dt[grep("pos", sample)]
mo_dt <- mo_dt[expression_lfc > 0]
mo_dt[, reporterline := str_remove(sample, "_pos")]
# add TF-target gene expression correlation
g_cors_dt <- unique(mo_dt[, .(gene, target_gene)])
g_cors_dt <- g_cors_dt[gene %in% rownames(tpms_mt)]
g_cors_dt <- g_cors_dt[target_gene %in% rownames(tpms_mt)]
g_cors_dt[, expression_correlation := cor(
unlist(tpms_mt[gene, ]),
unlist(tpms_mt[target_gene, ])
), by = 1:nrow(g_cors_dt)]
mo_dt <- merge.data.table(
mo_dt, g_cors_dt, by = c("gene", "target_gene"),
all.x = TRUE, sort = FALSE
)
mo_dt <- mo_dt[!is.na(expression_correlation)]
mo_dt[, c("expressed", "target_expressed") := NULL]
# save
fwrite(
mo_dt,
file.path(res_dir, "network.tsv.gz"),
sep = "\t"
)
# select correlated motifs
mo_dt_cor <- merge.data.table(
dt_comp_cor, mo_dt, by = c("motif", "gene", "reporterline"), sort = FALSE
)
# save
fwrite(
mo_dt_cor,
file.path(res_dir, "network_correlation.tsv.gz"),
sep = "\t"
)
Filtering by expression
mo_dt <- fread(file.path(res_dir, "network.tsv.gz"))
mo_dt[, reporterline := NULL]
mo_dt[, target_TF := ifelse(target_gene %in% .SD$gene, TRUE, FALSE), sample]
mo_dt[, name := substr(name, 1, 30)]
mo_dt[, pfam := substr(pfam, 1, 30)]
mo_dt[, target_name := substr(target_name, 1, 30)]
mo_dt[, target_pfam := substr(target_pfam, 1, 30)]
mo_dt <- mo_dt[expression_lfc > 1.5][target_expression_lfc > 1]
setcolorder(mo_dt, c(
"sample", "gene", "name", "pfam",
"expression_tpm", "expression_lfc",
"motif", "motif_name",
"target_gene", "target_name", "target_pfam",
"target_expression_tpm", "target_expression_lfc",
"expression_correlation", "n_samples"
))
How many target genes for each TF?
hits_dt <- mo_dt[, .N, .(gene, motif, sample)]
hits_gp <- ggplot(hits_dt, aes(sample, N, fill = sample)) +
geom_boxplot(outlier.color = NA) +
ggbeeswarm::geom_beeswarm(shape = 21) +
scale_y_continuous(trans = "log10") +
scale_fill_manual(values = line_cond_cols) +
labs(y = "target genes per TF") +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
legend.position = "none",
panel.grid.major.y = element_line(linewidth = 0.5),
panel.grid.minor.y = element_line(linewidth = 0.2)
) +
geom_text(
data = hits_dt[,.N,sample],
aes(label = N, y = 15)
)
hits_gp
Save network files
require(openxlsx)
wb <- createWorkbook()
for (smp in c("Elav", "Ncol", "Fox")) {
smp_fn <- file.path(res_dir, sprintf("network_%s.tsv.gz", smp))
smp_dt <- mo_dt[sample == paste0(smp, "_pos")]
message("Saving: ", smp_fn)
fwrite(smp_dt, smp_fn, sep = "\t")
# all genes
addWorksheet(wb, sheetName = smp)
writeData(wb, sheet = smp, smp_dt)
# TF-TF
tfs_dt <- smp_dt[target_TF == TRUE]
tfs_orph <- smp_dt[!gene %in% tfs_dt$gene, .SD[1], gene]
tfs_orph[, target_gene := "none"]
tfs_orph[, c("target_name", "target_pfam") := ""]
tfs_orph[, c("target_expression_tpm", "target_expression_lfc", "n_samples", "expression_correlation") := 0]
tfs_dt <- rbindlist(list(tfs_dt, tfs_orph), use.names = TRUE)
addWorksheet(wb, sheetName = paste0(smp, "_TF"))
writeData(wb, sheet = paste0(smp, "_TF"), tfs_dt)
}
## Saving: results/network_Elav.tsv.gz
## Saving: results/network_Ncol.tsv.gz
## Saving: results/network_Fox.tsv.gz
saveWorkbook(
wb,
file.path(res_dir, "network.xlsx"),
overwrite = TRUE
)
Comparison with Ozment et al. 2021 eLife paper.
# load data from paper and translate gene IDs to DToL IDs
pou_act <- readWorkbook(file.path(pub_dir, "elife-74336-supp6-v3.xlsx"), startRow = 2, colNames = TRUE)
pou_rep <- readWorkbook(file.path(pub_dir, "elife-74336-supp7-v3.xlsx"), startRow = 2, colNames = TRUE)
setDT(pou_act)
setDT(pou_rep)
pou_pub <- rbindlist(list(activated = pou_act, repressed = pou_rep), idcol = "relation")
setnames(pou_pub, "gene", "ID_Vienna")
dict <- fread(file.path("annotation", "DICTIONARY_Nvec_vienna_vs_Nvec_old_vs_Nvec_DTOL_v2.txt"))[, .(ID_Vienna, ID_DToL)]
setnames(dict, "ID_DToL", "gene")
pou_pub <- merge.data.table(pou_pub, dict, all.x = TRUE, sort = FALSE)
pou_pub <- pou_pub[, .(gene, relation, fold_change, mean_counts, pvalue)]
setnames(pou_pub, "gene", "target_gene")
# load our Ncol network
pou4 <- "Nvec_vc1.1_XM_032363992.2"
ncol_dt <- fread(file.path(res_dir, "network_Ncol.tsv.gz"))
pou_net <- ncol_dt[gene == pou4]
# overlap of Pou4 targets
pou_pub_act <- pou_pub[relation == "activated", .(target_gene, fold_change)]
setnames(pou_pub_act, "fold_change", "activated")
pou_pub_rep <- pou_pub[relation == "repressed", .(target_gene, fold_change)]
setnames(pou_pub_rep, "fold_change", "repressed")
pou_net_gen <- pou_net[, .(target_gene, target_expression_lfc)]
setnames(pou_net_gen, "target_expression_lfc", "network")
pou_dt <- merge.data.table(merge.data.table(
pou_pub_act, pou_pub_rep,
by = "target_gene", all = TRUE
), pou_net_gen, by = "target_gene", all = TRUE)
pou_mt <- as.matrix(pou_dt[, -1])
pou_mt[!is.na(pou_mt)] <- 1
pou_mt[is.na(pou_mt)] <- 0
require(eulerr)
fit <- euler(pou_mt)
p1 <- plot(
fit,
quantities = TRUE
)
print(p1)
Inspect the differences
# reverse activation and repression sign (mutants)
pou_dt[, published := -1 * activated][is.na(published), published := -1 * repressed]
# sets of target genes
missing_in_network <- unique(pou_dt[!is.na(published) & is.na(network)]$target_gene)
missing_in_network_act <- unique(pou_dt[!is.na(activated) & is.na(network)]$target_gene)
missing_in_network_rep <- unique(pou_dt[!is.na(repressed) & is.na(network)]$target_gene)
missing_in_published <- unique(pou_dt[is.na(published)]$target_gene)
target_intersect <- unique(pou_dt[!is.na(published) & !is.na(network)]$target_gene)
mo_dt <- fread(
file.path(atac_dir, "motifs_genes_targets_long.tsv.gz")
)
target_bound <- unique(mo_dt[sample=="Ncol_pos" & gene == pou4]$target_gene)
# expression for target genes
pou_expression_dt <- unique(ncol_dt[, .(target_gene, target_expression_lfc)])
# pou_expression_dt[target_gene %in% missing_in_network, annot := "published"]
pou_expression_dt[target_gene %in% missing_in_network_act, annot := "activated"]
pou_expression_dt[target_gene %in% missing_in_network_rep, annot := "repressed"]
pou_expression_dt[target_gene %in% missing_in_published, annot := "network"]
pou_expression_dt[target_gene %in% target_intersect, annot := "intersect"]
pou_expression_dt <- pou_expression_dt[!is.na(annot)]
pou_expression_dt[, bound := ifelse(target_gene %in% target_bound, "binding", "no binding")]
# plot
pou_expression_dt[, annot := factor(
annot, levels = c("network", "intersect", "activated", "repressed")
)]
gp_pou_exp <- ggplot(pou_expression_dt, aes(annot, target_expression_lfc, fill = bound)) +
geom_boxplot() +
ggbeeswarm::geom_quasirandom() +
geom_boxplot(outlier.color = NA, alpha = 0.5) +
scale_fill_manual(values = c("grey40", "grey80")) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
labs(x = "target genes", y = "expression log FC")
Genes in intersect
pou_int <- pou_net[target_gene %in% target_intersect]
pou_int <- merge.data.table(
pou_int, pou_pub[, .(target_gene, relation)],
by = "target_gene", all.x = TRUE, sort = FALSE
)
pou_int[, target_TF := target_gene %in% tfs_annt$gene]
# plot
gp_pou_int <- ggplot(pou_int, aes(
expression_correlation, target_expression_lfc,
size = 1/n_samples,
shape = target_TF,
fill = relation,
label = target_name
)) +
ggrepel::geom_text_repel(color = "black") +
geom_point(color = "grey10") +
scale_size_continuous(
range = c(1, 6),
breaks = seq(1/3, 1, length.out = 3),
labels = seq(3, 1),
name = "target in number of samples\n(Elav, Ncol, Fox)"
) +
scale_shape_manual(
values = c("TRUE" = 22, "FALSE" = 21),
name = "target TF?"
) +
scale_fill_manual(
values = c("activated" = "grey90", "repressed" = "grey50"),
name = "relation"
) +
guides(
fill = guide_legend(override.aes = list(size = 5, shape = 21)),
shape = guide_legend(override.aes = list(size = 5)),
size = guide_legend(override.aes = list(shape = 21, fill = "grey60"))
) +
labs(
x = "Pou4 expression correlation",
y = "target gene expression logFC",
title = "Intersect target genes"
)
gp_pou_int
## Warning: ggrepel: 30 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps